1 Functions

2 Data

# read processed data:
SCC1_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC1/SCC1_cancer.RDS")
SCC2_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC2/SCC2_cancer_processed.RDS")
SCC3_cancer = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC3/SCC3_cancer_processed.RDS")
SCC4_cancer = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC4/SCC4_cancer_processed.RDS")
SCC5_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC5/SCC5_cancer_processed.RDS")

scc.big <- merge(SCC1_cancer, y = c(SCC2_cancer,SCC3_cancer, SCC4_cancer,SCC5_cancer), add.cell.ids = c("SCC1","SCC2", "SCC3", "SCC4","SCC5"), project = "SCC")
scc.big$orig.ident =  sapply(X = strsplit(colnames(scc.big), split = "_"), FUN = "[", 1)


VlnPlot(object = scc.big,features = "MYB",group.by = "orig.ident",slot = "data", assay = "RNA")
Warning in grSoftVersion() :
  unable to load shared object '/usr/local/lib/R/modules//R_X11.so':
  libXt.so.6: cannot open shared object file: No such file or directory

myb_data = FetchData(object = scc.big,vars = c("MYB","orig.ident"),slot = "data", assay = "RNA")
myb_data %>%   group_by(orig.ident) %>% 
  summarise(counts = sum(MYB > 0, na.rm = TRUE))

3 Patients filter

scc_myb_patients<- subset(scc.big, subset = orig.ident %in% c("SCC3","SCC4","SCC5"))
library(rstatix)
myb_vs_hpv = FetchData(object = scc_myb_patients, vars = c("hpv_positive", "MYB"))
df = reshape2::melt(myb_vs_hpv,value.name = "logTPM") %>% dplyr::rename(gene = variable)
Using hpv_positive as id variables
stat.test <- df %>%
  wilcox_test(logTPM ~ hpv_positive)

stat.test



p = ggplot(myb_vs_hpv,aes( x = hpv_positive, y = MYB)) +geom_boxplot(width=.1,outlier.shape = NA) +
  theme_minimal()+
  stat_summary(fun.data = function(x) data.frame(y=0.35, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("average MYB")+ggtitle("CECS") + xlab("HPV status")+
  stat_pvalue_manual(stat.test,y.position = 0.43, label = "Wilcox, p = {p}",remove.bracket = F,bracket.shorten = 0.1,tip.length = 0.01)+coord_cartesian(ylim = c(0,0.5))
  


p

```r
pdf(file = \./Figures/SCC_MYB_HPV_boxplot_all_patients.pdf\,height = 4)
p
dev.off()

<!-- rnb-source-end -->

<!-- rnb-output-begin eyJkYXRhIjoibnVsbCBkZXZpY2UgXG4gICAgICAgICAgMSBcbiJ9 -->

null device 1




<!-- rnb-output-end -->

<!-- rnb-chunk-end -->


<!-- rnb-text-begin -->



<!-- rnb-text-end -->


<!-- rnb-chunk-begin -->


<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuIyBEZXNjcmlwdGlvblxuXG5teWJfdnNfaHB2ID0gRmV0Y2hEYXRhKG9iamVjdCA9IHNjY19teWJfcGF0aWVudHMsIHZhcnMgPSBjKFwiSFBWX3JlYWRzXCIsIFwiTVlCXCIpKVxuc3AgPC0gZ2dzY2F0dGVyKG15Yl92c19ocHYsIHggPSBcIkhQVl9yZWFkc1wiLCB5ID0gXCJNWUJcIixcbiAgIGFkZCA9IFwicmVnLmxpbmVcIiwgICMgQWRkIHJlZ3Jlc3NpbiBsaW5lXG4gICBhZGQucGFyYW1zID0gbGlzdChjb2xvciA9IFwiYmx1ZVwiLCBmaWxsID0gXCJsaWdodGdyYXlcIiksICMgQ3VzdG9taXplIHJlZy4gbGluZVxuICAgY29uZi5pbnQgPSBUUlVFICMgQWRkIGNvbmZpZGVuY2UgaW50ZXJ2YWxcbiAgIClcbiMgQWRkIGNvcnJlbGF0aW9uIGNvZWZmaWNpZW50XG5zcCArIHN0YXRfY29yKG1ldGhvZCA9IFwicGVhcnNvblwiLCBsYWJlbC54ID0gMywgbGFiZWwueSA9IDUpXG5gYGAifQ== -->

```r
# Description

myb_vs_hpv = FetchData(object = scc_myb_patients, vars = c("HPV_reads", "MYB"))
sp <- ggscatter(myb_vs_hpv, x = "HPV_reads", y = "MYB",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson", label.x = 3, label.y = 5)
`geom_smooth()` using formula 'y ~ x'

4 HMSC HPV DEG boxplots

library(rstatix)

hmsc_deg = read.table(file = "./Data_out/HPV_analysis/hpv_deg_df.csv",row.names = 1)
top_genes = hmsc_deg %>% filter(.$fdr<0.05) %>% filter(.$avg_diff>log2(1.5)) %>%  arrange(dplyr::desc(.$avg_diff)) %>% head(5) %>% rownames()
top_genes = c(top_genes, "MYB", "JAG1")

myb_vs_hpv = FetchData(object = scc_myb_patients,vars = c("hpv_positive",top_genes))
df = reshape2::melt(myb_vs_hpv,value.name = "logTPM") %>% dplyr::rename(gene = variable)
Using hpv_positive as id variables
stat.test <- df %>%
    group_by(gene) %>%
  wilcox_test(logTPM ~ hpv_positive) %>%
  mutate(y.position = 5)

stat.test

stat.test <- stat.test %>% 
  add_xy_position(x = "gene", dodge = 0.8)

p = ggviolin(
  df,
  x = "gene",
  y = "logTPM",
  fill = "hpv_positive",
  palette = "jco",scale = 'width',trim = T,
  add = "boxplot"
)+ stat_pvalue_manual(stat.test,y.position = 4, label = "p = {p}",remove.bracket = F)

p



 ggplot(df, aes(x = gene, y = logTPM, group = interaction(gene,hpv_positive))) +
   geom_violin(scale = 'width',trim=T,position = position_dodge(0.8),aes(fill =hpv_positive)) +
   geom_boxplot(position = position_dodge(0.8), outlier.shape = NA, coef=0,width = 0.15, fill="white")+ 
   stat_pvalue_manual(stat.test,y.position = 3.2, label = "p = {p}",remove.bracket = F)+
  stat_summary(aes(fill =hpv_positive),inherit.aes = T,fun.data = function(x) data.frame(y=3.8, label = round(mean(x),digits = 2)), geom="text",position = position_dodge(0.8))
Warning: Ignoring unknown aesthetics: fill

 
 
  ggplot(df, aes(x = gene, y = logTPM)) +
   geom_boxplot(position = position_dodge(0.8), coef=0,width = 0.75,aes(fill =hpv_positive))+
  stat_summary(aes(fill =hpv_positive),inherit.aes = T,fun.data = function(x) data.frame(y=3.8, label = round(mean(x),digits = 2)), geom="text",position = position_dodge(0.8)) + 
   stat_pvalue_manual(stat.test,y.position = 3.5, label = "p = {p}",remove.bracket = F)
Warning: Ignoring unknown aesthetics: fill

NA
NA
NA
```r
pdf(file = \./Figures/SCC_HPV_genes_all_patients.pdf\,width = 8)
p
dev.off()

<!-- rnb-source-end -->

<!-- rnb-chunk-end -->


<!-- rnb-text-begin -->


# split violin

<!-- rnb-text-end -->


<!-- rnb-chunk-begin -->


<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxubWV0YWdlbmVzX3Zpb2xpbl9jb21wYXJlLjIgPSBmdW5jdGlvbihkYXRhc2V0LHByZWZpeCA9IFwiXCIscHJlX29uID0gYyhcIk9TSVwiLFwiTlRcIiksYXhpcy50ZXh0LnggPSAxMSx0ZXN0ID0gXCJ0LnRlc3RcIiwgcHJvZ3JhbXMgPSBjKFwiSHlwb3hpYVwiLFwiVE5GYVwiLFwiQ2VsbF9jeWNsZVwiKSxyZXR1cm5fbGlzdCA9IEYsY29tYmluZV9wYXRpZW50cyA9IEYpe1xuICByZXF1aXJlKGZhY2VmdW5zKVxuICBwbHQubHN0ID0gbGlzdCgpXG4gIGlmKGNvbWJpbmVfcGF0aWVudHMpe1xuICAgIGdlbmVzX2J5X3RwID0gRmV0Y2hEYXRhKG9iamVjdCA9IGRhdGFzZXQsdmFycyA9ICBjKFwidHJlYXRtZW50XCIscHJvZ3JhbXMpKSAlPiUgZmlsdGVyKHRyZWF0bWVudCAlaW4lIHByZV9vbikgICU+JSBhcy5kYXRhLmZyYW1lKCkgI21lYW4gZXhwcmVzc2lvblxuICAgIGZvcm11bGEgPC0gYXMuZm9ybXVsYSggcGFzdGUoXCJjKFwiLCBwYXN0ZShwcm9ncmFtcywgY29sbGFwc2UgPSBcIixcIiksIFwiKX4gdHJlYXRtZW50IFwiKSApXG4gICAgXG4gICAgI3Bsb3QgYW5kIHNwbGl0IGJ5IHBhdGllbnQ6ICAgXG4gICAgc3RhdC50ZXN0ID0gY29tcGFyZV9tZWFucyhmb3JtdWxhID0gZm9ybXVsYSAsZGF0YSA9IGdlbmVzX2J5X3RwLG1ldGhvZCA9IHRlc3QscC5hZGp1c3QubWV0aG9kID0gXCJmZHJcIiklPiUgIyBBZGQgcGFpcndpc2UgY29tcGFyaXNvbnMgcC12YWx1ZVxuICAgICAgZHBseXI6OmZpbHRlcihncm91cDEgPT0gcHJlX29uWzFdICYgZ3JvdXAyID09IHByZV9vblsyXSkgICNmaWx0ZXIgZm9yIHByZSB2cyBvbiB0cmVhdG1lbnQgb25seVxuICAgIFxuICAgIHN0YXQudGVzdCRwLmZvcm1hdCA9c3RhdC50ZXN0JHAuYWRqICNtb2RpZnQgMCBwdmFsdWUgdG8gYmUgbG93ZXN0IHBvc3NpYmxlIGZsb2F0XG4gICAgc3RhdC50ZXN0JHAuZm9ybWF0WyFzdGF0LnRlc3QkcC5mb3JtYXQgPT0gMCBdIDwtIHBhc3RlKFwiPVwiLHN0YXQudGVzdCRwLmZvcm1hdFshc3RhdC50ZXN0JHAuZm9ybWF0ID09IDAgXSlcbiAgICBzdGF0LnRlc3QkcC5mb3JtYXRbc3RhdC50ZXN0JHAuZm9ybWF0ID09IDAgXSA8LSBwYXN0ZShcIjxcIiwuTWFjaGluZSRkb3VibGUueG1pbiAlPiUgc2lnbmlmKGRpZ2l0cyA9IDMpKVxuICAgIFxuICAgIFxuICAgIGdlbmVzX2J5X3RwID0gcmVzaGFwZTI6Om1lbHQoZ2VuZXNfYnlfdHAsIGlkLnZhcnMgPSBjKFwidHJlYXRtZW50XCIpLHZhbHVlLm5hbWUgPSBcInNjb3JlXCIpXG4gICAgcGx0ID0gZ2dwbG90KGdlbmVzX2J5X3RwLCBhZXMoeCA9IHZhcmlhYmxlLCB5ID0gc2NvcmUsZmlsbCA9IHRyZWF0bWVudCkpICsgZ2VvbV9zcGxpdF92aW9saW4oc2NhbGUgPSAnd2lkdGgnKSsgXG4gICAgICBnZW9tX2JveHBsb3Qod2lkdGggPSAwLjI1LCBub3RjaCA9IEZBTFNFLCBub3RjaHdpZHRoID0gLjQsIG91dGxpZXIuc2hhcGUgPSBOQSwgY29lZj0wKStcbiAgICAgIHlsaW0obWluKGdlbmVzX2J5X3RwJHNjb3JlKSxtYXgoZ2VuZXNfYnlfdHAkc2NvcmUpKjEuMjUpXG4gICAgcGx0ID0gcGx0ICtzdGF0X3B2YWx1ZV9tYW51YWwoc3RhdC50ZXN0LCBsYWJlbCA9IFwiYXNkID0ge3Auc2lnbmlmfVwiLCBsYWJlbC5zaXplID0gNywgICNhZGQgcCB2YWx1ZVxuICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHkucG9zaXRpb24gPSAxLGluaGVyaXQuYWVzID0gRixzaXplID0gMy4zLHggPSBcIi55LlwiKSAjIHNldCBwb3NpdGlvbiBhdCB0aGUgdG9wIHZhbHVlXG4gICAgcmV0dXJuKHBsdClcbiAgfVxuICBcbiAgXG4gIFxuICBcbiAgXG4gIGlmIChyZXR1cm5fbGlzdCkge1xuICAgIHJldHVybihwbHQubHN0KVxuICB9XG59XG5gYGAifQ== -->

```r
metagenes_violin_compare.2 = function(dataset,prefix = "",pre_on = c("OSI","NT"),axis.text.x = 11,test = "t.test", programs = c("Hypoxia","TNFa","Cell_cycle"),return_list = F,combine_patients = F){
  require(facefuns)
  plt.lst = list()
  if(combine_patients){
    genes_by_tp = FetchData(object = dataset,vars =  c("treatment",programs)) %>% filter(treatment %in% pre_on)  %>% as.data.frame() #mean expression
    formula <- as.formula( paste("c(", paste(programs, collapse = ","), ")~ treatment ") )
    
    #plot and split by patient:   
    stat.test = compare_means(formula = formula ,data = genes_by_tp,method = test,p.adjust.method = "fdr")%>% # Add pairwise comparisons p-value
      dplyr::filter(group1 == pre_on[1] & group2 == pre_on[2])  #filter for pre vs on treatment only
    
    stat.test$p.format =stat.test$p.adj #modift 0 pvalue to be lowest possible float
    stat.test$p.format[!stat.test$p.format == 0 ] <- paste("=",stat.test$p.format[!stat.test$p.format == 0 ])
    stat.test$p.format[stat.test$p.format == 0 ] <- paste("<",.Machine$double.xmin %>% signif(digits = 3))
    
    
    genes_by_tp = reshape2::melt(genes_by_tp, id.vars = c("treatment"),value.name = "score")
    plt = ggplot(genes_by_tp, aes(x = variable, y = score,fill = treatment)) + geom_split_violin(scale = 'width')+ 
      geom_boxplot(width = 0.25, notch = FALSE, notchwidth = .4, outlier.shape = NA, coef=0)+
      ylim(min(genes_by_tp$score),max(genes_by_tp$score)*1.25)
    plt = plt +stat_pvalue_manual(stat.test, label = "asd = {p.signif}", label.size = 7,  #add p value
                                  y.position = 1,inherit.aes = F,size = 3.3,x = ".y.") # set position at the top value
    return(plt)
  }
  
  
  
  
  
  if (return_list) {
    return(plt.lst)
  }
}
scc_myb_patients$treatment = scc_myb_patients$hpv_positive %>% gsub(pattern = "negative",replacement = "HPV-negative")%>% gsub(pattern = "positive",replacement = "HPV-positive")
scc_myb_patients@meta.data[["treatment"]] = factor(scc_myb_patients$treatment, levels = c("HPV-positive", "HPV-negative"))

p = metagenes_violin_compare.2(dataset = scc_myb_patients, prefix = "patient",pre_on = c("HPV-negative","HPV-positive"),test = "wilcox.test",programs = genes, return_list = F,combine_patients = T) +scale_y_continuous(limits = c(0,1.5)) + labs(fill = "")+ylab("LogTPM")+xlab("Gene")+theme(axis.title=element_text(size=14))+ scale_fill_manual(values=c("#F8766D", "cyan3"))
Loading required package: facefuns
Warning: `gather_()` was deprecated in tidyr 1.2.0.
Please use `gather()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
p
Warning: Removed 1036 rows containing non-finite values (stat_ydensity).
Warning: Removed 1036 rows containing non-finite values (stat_boxplot).

```r
pdf(file = \./Figures/SCC_HPV_Top_violin.pdf\,width = 10,height = 5)
p

<!-- rnb-source-end -->

<!-- rnb-output-begin eyJkYXRhIjoiV2FybmluZzogUmVtb3ZlZCAzOTIgcm93cyBjb250YWluaW5nIG5vbi1maW5pdGUgdmFsdWVzIChzdGF0X3lkZW5zaXR5KS5cbldhcm5pbmc6IFJlbW92ZWQgMzkyIHJvd3MgY29udGFpbmluZyBub24tZmluaXRlIHZhbHVlcyAoc3RhdF9ib3hwbG90KS5cbiJ9 -->

Warning: Removed 392 rows containing non-finite values (stat_ydensity). Warning: Removed 392 rows containing non-finite values (stat_boxplot).




<!-- rnb-output-end -->

<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxuZGV2Lm9mZigpXG5gYGBcbmBgYCJ9 -->

```r
```r
dev.off()

<!-- rnb-source-end -->

<!-- rnb-output-begin eyJkYXRhIjoibnVsbCBkZXZpY2UgXG4gICAgICAgICAgMSBcbiJ9 -->

null device 1




<!-- rnb-output-end -->

<!-- rnb-chunk-end -->


<!-- rnb-text-begin -->



<!-- rnb-text-end -->


<!-- rnb-chunk-begin -->


<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxubXlwbG90cyA8LSBsaXN0KCkgICMgbmV3IGVtcHR5IGxpc3RcblxuZm9yIChwYXRpZW50X25hbWUgaW4gYyhcXFNDQzJcXCwgXFxTQ0MzXFwsIFxcU0NDNFxcLCBcXFNDQzVcXCkpIHtcbiAgcGF0aWVudF9kYXRhID0gc3Vic2V0KHggPSBzY2NfbXliX3BhdGllbnRzLCBzdWJzZXQgPSBvcmlnLmlkZW50ID09IHBhdGllbnRfbmFtZSlcbiAgZGYgID0gRmV0Y2hEYXRhKG9iamVjdCA9IHBhdGllbnRfZGF0YSxcbiAgICAgICAgICAgICAgICAgIHZhcnMgPSBjKFxcaHB2X3Bvc2l0aXZlXFwsIFxcTVlCX3Bvc2l0aXZlXFwpKSAlPiUgZHJvcGxldmVscygpXG4gIHRlc3QgPSBmaXNoZXJfdGVzdCh0YWJsZShkZikpXG4gIHAgPSBnZ2JhcnN0YXRzKFxuICAgIGRmLFxuICAgIE1ZQl9wb3NpdGl2ZSxcbiAgICBocHZfcG9zaXRpdmUsXG4gICAgcmVzdWx0cy5zdWJ0aXRsZSA9IEZBTFNFLFxuICAgIHN1YnRpdGxlID0gcGFzdGUwKFxcRmlzaGVyJ3MgZXhhY3QgdGVzdFxcLCBcXFxuYGBgIn0= -->

```r
```r
myplots <- list()  # new empty list

for (patient_name in c(\SCC2\, \SCC3\, \SCC4\, \SCC5\)) {
  patient_data = subset(x = scc_myb_patients, subset = orig.ident == patient_name)
  df  = FetchData(object = patient_data,
                  vars = c(\hpv_positive\, \MYB_positive\)) %>% droplevels()
  test = fisher_test(table(df))
  p = ggbarstats(
    df,
    MYB_positive,
    hpv_positive,
    results.subtitle = FALSE,
    subtitle = paste0(\Fisher's exact test\, \

scc_myb_patients = SetIdent(scc_myb_patients,value = "hpv_positive")
scc_myb_patients = FindVariableFeatures(object = scc_myb_patients,nfeatures = 10000)
features = VariableFeatures(scc_myb_patients)

deg <-
  FindMarkers(
    scc_myb_patients,
    ident.1 = "positive",
    ident.2 = "negative",
    features = features,
    densify = T,
    assay = "RNA",
    logfc.threshold = 0,
    min.pct = 0.1,
    only.pos = F,
    mean.fxn = function(x) {
      return(log(rowMeans(x) + 1, base = 2)) # change func to calculate logFC in log space data (default to exponent data)
    }
  )
deg$fdr<-p.adjust(p = as.vector(deg$p_val) ,method = "fdr" )

5 Find DEG

suffix = "5Kvargenes"
scc_myb_patients = SetIdent(scc_myb_patients,value = "hpv_positive")
scc_myb_patients = FindVariableFeatures(object = scc_myb_patients,nfeatures = 5000)
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
features = VariableFeatures(scc_myb_patients)

deg = FindMarkers(object = scc_myb_patients,ident.1 = "positive",ident.2 = "negative",
            features = features,test.use = "LR",latent.vars = "orig.ident",
            logfc.threshold = 0,min.pct = 0.1,
            mean.fxn = function(x) {
              return((rowMeans(x)+1)) # change func to calculate logFC in log space data (default to exponent data)
            })

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~01m 14s      
  |++                                                | 2 % ~01m 10s      
  |++                                                | 3 % ~01m 08s      
  |+++                                               | 4 % ~01m 08s      
  |+++                                               | 5 % ~01m 07s      
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

  |++++                                              | 6 % ~01m 06s      
  |++++                                              | 7 % ~01m 04s      
  |+++++                                             | 8 % ~01m 02s      
  |+++++                                             | 9 % ~01m 01s      
  |++++++                                            | 10% ~01m 01s      
  |++++++                                            | 11% ~01m 01s      
  |+++++++                                           | 12% ~01m 00s      
  |+++++++                                           | 13% ~59s          
  |++++++++                                          | 14% ~59s          
  |++++++++                                          | 15% ~57s          
  |+++++++++                                         | 16% ~57s          
  |+++++++++                                         | 17% ~56s          
  |++++++++++                                        | 18% ~55s          
  |++++++++++                                        | 19% ~54s          
  |+++++++++++                                       | 20% ~53s          
  |+++++++++++                                       | 21% ~52s          
  |++++++++++++                                      | 22% ~52s          
  |++++++++++++                                      | 23% ~52s          
  |+++++++++++++                                     | 24% ~51s          
  |+++++++++++++                                     | 25% ~51s          
  |++++++++++++++                                    | 26% ~50s          
  |++++++++++++++                                    | 27% ~50s          
  |+++++++++++++++                                   | 28% ~49s          
  |+++++++++++++++                                   | 29% ~48s          
  |++++++++++++++++                                  | 30% ~47s          
  |++++++++++++++++                                  | 31% ~46s          
  |+++++++++++++++++                                 | 32% ~46s          
  |+++++++++++++++++                                 | 33% ~45s          
  |++++++++++++++++++                                | 34% ~44s          
  |++++++++++++++++++                                | 35% ~44s          
  |+++++++++++++++++++                               | 36% ~43s          
  |+++++++++++++++++++                               | 37% ~42s          
  |++++++++++++++++++++                              | 38% ~42s          
  |++++++++++++++++++++                              | 39% ~41s          
  |+++++++++++++++++++++                             | 40% ~40s          
  |+++++++++++++++++++++                             | 41% ~40s          
  |++++++++++++++++++++++                            | 42% ~39s          
  |++++++++++++++++++++++                            | 43% ~38s          
  |+++++++++++++++++++++++                           | 44% ~38s          
  |+++++++++++++++++++++++                           | 45% ~37s          
  |++++++++++++++++++++++++                          | 46% ~36s          
  |++++++++++++++++++++++++                          | 47% ~35s          
  |+++++++++++++++++++++++++                         | 48% ~35s          
  |+++++++++++++++++++++++++                         | 49% ~34s          
  |++++++++++++++++++++++++++                        | 51% ~33s          
  |++++++++++++++++++++++++++                        | 52% ~32s          
  |+++++++++++++++++++++++++++                       | 53% ~32s          
  |+++++++++++++++++++++++++++                       | 54% ~31s          
  |++++++++++++++++++++++++++++                      | 55% ~30s          
  |++++++++++++++++++++++++++++                      | 56% ~30s          
  |+++++++++++++++++++++++++++++                     | 57% ~29s          
  |+++++++++++++++++++++++++++++                     | 58% ~28s          
  |++++++++++++++++++++++++++++++                    | 59% ~28s          
  |++++++++++++++++++++++++++++++                    | 60% ~27s          
  |+++++++++++++++++++++++++++++++                   | 61% ~26s          
  |+++++++++++++++++++++++++++++++                   | 62% ~26s          
  |++++++++++++++++++++++++++++++++                  | 63% ~25s          
  |++++++++++++++++++++++++++++++++                  | 64% ~24s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~24s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~23s          
  |++++++++++++++++++++++++++++++++++                | 67% ~22s          
  |++++++++++++++++++++++++++++++++++                | 68% ~22s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~21s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~20s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~19s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~19s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~18s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~17s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~17s          
  |++++++++++++++++++++++++++++++++++++++            | 76% ~16s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~15s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~15s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~14s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~13s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~13s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~12s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~11s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~11s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~10s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~09s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~09s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~08s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~07s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~07s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~06s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~05s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~05s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~04s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~03s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~03s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~02s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01m 06s
deg$fdr<-p.adjust(p = as.vector(deg$p_val) ,method = "fdr")
data_to_save = deg %>% dplyr::rename(avg_diff = avg_log2FC) #rename avg_log2FC because here we calculate diff
saveRDS(object = data_to_save,file = "./Data_out/scc_deg_5Kvargenes.rds")
---
title: '`r rstudioapi::getSourceEditorContext()$path %>% basename() %>% gsub(pattern = "\\.Rmd",replacement = "")`' 
author: "Avishai Wizel"
date: '`r Sys.time()`'
output: 
  html_notebook: 
    code_folding: hide
    toc: yes
    toc_collapse: yes
    toc_float: 
      collapsed: FALSE
    number_sections: true
    toc_depth: 1
---



# Functions

```{r warning=FALSE}
```

```{r}
```

# Data


```{r}
# read processed data:
SCC1_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC1/SCC1_cancer.RDS")
SCC2_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC2/SCC2_cancer_processed.RDS")
SCC3_cancer = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC3/SCC3_cancer_processed.RDS")
SCC4_cancer = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC4/SCC4_cancer_processed.RDS")
SCC5_cancer  = readRDS(file = "./Data/cervical_cancer_data/Qiu et al/SCC5/SCC5_cancer_processed.RDS")

scc.big <- merge(SCC1_cancer, y = c(SCC2_cancer,SCC3_cancer, SCC4_cancer,SCC5_cancer), add.cell.ids = c("SCC1","SCC2", "SCC3", "SCC4","SCC5"), project = "SCC")
scc.big$orig.ident =  sapply(X = strsplit(colnames(scc.big), split = "_"), FUN = "[", 1)


VlnPlot(object = scc.big,features = "MYB",group.by = "orig.ident",slot = "data", assay = "RNA")


```


```{r}
myb_data = FetchData(object = scc.big,vars = c("MYB","orig.ident"),slot = "data", assay = "RNA")
myb_data %>%   group_by(orig.ident) %>% 
  summarise(counts = sum(MYB > 0, na.rm = TRUE))
```


# Patients filter
```{r fig.height=6, fig.width=12}
scc_myb_patients<- subset(scc.big, subset = orig.ident %in% c("SCC3","SCC4","SCC5"))
```


```{r}
library(rstatix)
myb_vs_hpv = FetchData(object = scc_myb_patients, vars = c("hpv_positive", "MYB"))
df = reshape2::melt(myb_vs_hpv,value.name = "logTPM") %>% dplyr::rename(gene = variable)

stat.test <- df %>%
  wilcox_test(logTPM ~ hpv_positive)

stat.test



p = ggplot(myb_vs_hpv,aes( x = hpv_positive, y = MYB)) +geom_boxplot(width=.1,outlier.shape = NA) +
  theme_minimal()+
  stat_summary(fun.data = function(x) data.frame(y=0.35, label = paste("Mean=",round(mean(x),digits = 2))), geom="text") +ylab("average MYB")+ggtitle("CECS") + xlab("HPV status")+
  stat_pvalue_manual(stat.test,y.position = 0.43, label = "Wilcox, p = {p}",remove.bracket = F,bracket.shorten = 0.1,tip.length = 0.01)+coord_cartesian(ylim = c(0,0.5))
  


p
```
```{r}
pdf(file = "./Figures/SCC_MYB_HPV_boxplot_all_patients.pdf",height = 4)
p
dev.off()
```

```{r}
# Description

myb_vs_hpv = FetchData(object = scc_myb_patients, vars = c("HPV_reads", "MYB"))
sp <- ggscatter(myb_vs_hpv, x = "HPV_reads", y = "MYB",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp + stat_cor(method = "pearson", label.x = 3, label.y = 5)

```


# HMSC HPV DEG boxplots
```{r fig.height=5, fig.width=12}
library(rstatix)

hmsc_deg = read.table(file = "./Data_out/HPV_analysis/hpv_deg_df.csv",row.names = 1)
top_genes = hmsc_deg %>% filter(.$fdr<0.05) %>% filter(.$avg_diff>log2(1.5)) %>%  arrange(dplyr::desc(.$avg_diff)) %>% head(5) %>% rownames()
top_genes = c(top_genes, "MYB", "JAG1")

myb_vs_hpv = FetchData(object = scc_myb_patients,vars = c("hpv_positive",top_genes))
df = reshape2::melt(myb_vs_hpv,value.name = "logTPM") %>% dplyr::rename(gene = variable)


stat.test <- df %>%
    group_by(gene) %>%
  wilcox_test(logTPM ~ hpv_positive) %>%
  mutate(y.position = 5)

stat.test

stat.test <- stat.test %>% 
  add_xy_position(x = "gene", dodge = 0.8)

p = ggviolin(
  df,
  x = "gene",
  y = "logTPM",
  fill = "hpv_positive",
  palette = "jco",scale = 'width',trim = T,
  add = "boxplot"
)+ stat_pvalue_manual(stat.test,y.position = 4, label = "p = {p}",remove.bracket = F)

p


 ggplot(df, aes(x = gene, y = logTPM, group = interaction(gene,hpv_positive))) +
   geom_violin(scale = 'width',trim=T,position = position_dodge(0.8),aes(fill =hpv_positive)) +
   geom_boxplot(position = position_dodge(0.8), outlier.shape = NA, coef=0,width = 0.15, fill="white")+ 
   stat_pvalue_manual(stat.test,y.position = 3.2, label = "p = {p}",remove.bracket = F)+
  stat_summary(aes(fill =hpv_positive),inherit.aes = T,fun.data = function(x) data.frame(y=3.8, label = round(mean(x),digits = 2)), geom="text",position = position_dodge(0.8))
 
 
  ggplot(df, aes(x = gene, y = logTPM)) +
   geom_boxplot(position = position_dodge(0.8), coef=0,width = 0.75,aes(fill =hpv_positive))+
  stat_summary(aes(fill =hpv_positive),inherit.aes = T,fun.data = function(x) data.frame(y=3.8, label = round(mean(x),digits = 2)), geom="text",position = position_dodge(0.8)) + 
   stat_pvalue_manual(stat.test,y.position = 3.5, label = "p = {p}",remove.bracket = F)
 
  

```


```{r}
pdf(file = "./Figures/SCC_HPV_genes_all_patients.pdf",width = 8)
p
dev.off()
```

# split violin
```{r}
metagenes_violin_compare.2 = function(dataset,prefix = "",pre_on = c("OSI","NT"),axis.text.x = 11,test = "t.test", programs = c("Hypoxia","TNFa","Cell_cycle"),return_list = F,combine_patients = F){
  require(facefuns)
  plt.lst = list()
  if(combine_patients){
    genes_by_tp = FetchData(object = dataset,vars =  c("treatment",programs)) %>% filter(treatment %in% pre_on)  %>% as.data.frame() #mean expression
    formula <- as.formula( paste("c(", paste(programs, collapse = ","), ")~ treatment ") )
    
    #plot and split by patient:   
    stat.test = compare_means(formula = formula ,data = genes_by_tp,method = test,p.adjust.method = "fdr")%>% # Add pairwise comparisons p-value
      dplyr::filter(group1 == pre_on[1] & group2 == pre_on[2])  #filter for pre vs on treatment only
    
    stat.test$p.format =stat.test$p.adj #modift 0 pvalue to be lowest possible float
    stat.test$p.format[!stat.test$p.format == 0 ] <- paste("=",stat.test$p.format[!stat.test$p.format == 0 ])
    stat.test$p.format[stat.test$p.format == 0 ] <- paste("<",.Machine$double.xmin %>% signif(digits = 3))
    
    
    genes_by_tp = reshape2::melt(genes_by_tp, id.vars = c("treatment"),value.name = "score")
    plt = ggplot(genes_by_tp, aes(x = variable, y = score,fill = treatment)) + geom_split_violin(scale = 'width')+ 
      geom_boxplot(width = 0.25, notch = FALSE, notchwidth = .4, outlier.shape = NA, coef=0)+
      ylim(min(genes_by_tp$score),max(genes_by_tp$score)*1.25)
    plt = plt +stat_pvalue_manual(stat.test, label = "asd = {p.signif}", label.size = 7,  #add p value
                                  y.position = 1,inherit.aes = F,size = 3.3,x = ".y.") # set position at the top value
    return(plt)
  }
  
  
  
  
  
  if (return_list) {
    return(plt.lst)
  }
}
```

```{r fig.height=5, fig.width=10}
scc_myb_patients$treatment = scc_myb_patients$hpv_positive %>% gsub(pattern = "negative",replacement = "HPV-negative")%>% gsub(pattern = "positive",replacement = "HPV-positive")
scc_myb_patients@meta.data[["treatment"]] = factor(scc_myb_patients$treatment, levels = c("HPV-positive", "HPV-negative"))

p = metagenes_violin_compare.2(dataset = scc_myb_patients, prefix = "patient",pre_on = c("HPV-negative","HPV-positive"),test = "wilcox.test",programs = top_genes, return_list = F,combine_patients = T) +scale_y_continuous(limits = c(0,1.5)) + labs(fill = "")+ylab("LogTPM")+xlab("Gene")+theme(axis.title=element_text(size=14))+ scale_fill_manual(values=c("#F8766D", "cyan3"))
p
```


```{r}
pdf(file = "./Figures/SCC_HPV_Top_violin.pdf",width = 10,height = 5)
p
dev.off()
```

```{r fig.height=7, fig.width=13}
myplots <- list()  # new empty list

for (patient_name in c("SCC2", "SCC3", "SCC4", "SCC5")) {
  patient_data = subset(x = scc_myb_patients, subset = orig.ident == patient_name)
  df  = FetchData(object = patient_data,
                  vars = c("hpv_positive", "MYB_positive")) %>% droplevels()
  test = fisher_test(table(df))
  p = ggbarstats(
    df,
    MYB_positive,
    hpv_positive,
    results.subtitle = FALSE,
    subtitle = paste0("Fisher's exact test", ", p-value = ",
                      test$p)
  ,title = patient_name)
  myplots[[patient_name]] <- p  # add each plot into plot list

}
ggarrange(plotlist = myplots,ncol = 4,nrow = 1)

```


```{r}
scc_myb_patients = SetIdent(scc_myb_patients,value = "hpv_positive")
scc_myb_patients = FindVariableFeatures(object = scc_myb_patients,nfeatures = 10000)
features = VariableFeatures(scc_myb_patients)

deg <-
  FindMarkers(
    scc_myb_patients,
    ident.1 = "positive",
    ident.2 = "negative",
    features = features,
    densify = T,
    assay = "RNA",
    logfc.threshold = 0,
    min.pct = 0.1,
    only.pos = F,
    mean.fxn = function(x) {
      return(log(rowMeans(x) + 1, base = 2)) # change func to calculate logFC in log space data (default to exponent data)
    }
  )
deg$fdr<-p.adjust(p = as.vector(deg$p_val) ,method = "fdr" )



```

# Find DEG 

```{r}
suffix = "5Kvargenes"
scc_myb_patients = SetIdent(scc_myb_patients,value = "hpv_positive")
scc_myb_patients = FindVariableFeatures(object = scc_myb_patients,nfeatures = 5000)
features = VariableFeatures(scc_myb_patients)

deg = FindMarkers(object = scc_myb_patients,ident.1 = "positive",ident.2 = "negative",
            features = features,test.use = "LR",latent.vars = "orig.ident",
            logfc.threshold = 0,min.pct = 0.1,
            mean.fxn = function(x) {
              return((rowMeans(x)+1)) # change func to calculate logFC in log space data (default to exponent data)
            })
deg$fdr<-p.adjust(p = as.vector(deg$p_val) ,method = "fdr")
data_to_save = deg %>% dplyr::rename(avg_diff = avg_log2FC) #rename avg_log2FC because here we calculate diff
saveRDS(object = data_to_save,file = "./Data_out/scc_deg_5Kvargenes.rds")
```




